Pelagophyceae
Initialize
library(pr2database)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(tibble)
library(readr)
library(maps)
library(cowplot)
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
library(patchwork)
# data("pr2")## Warning: package 'knitr' was built under R version 3.5.3
PR2
Use the PR2 on Google
# Info for the local dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
filter (is.na(removed_version)) %>%
collect()
pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version)) %>%
collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>%
collect()
# Join the tables and keep only sequences that are not removed
pr2 <- pr2_main %>%
left_join(pr2_taxo, by = c("species"="species")) %>%
left_join (pr2_seq) %>%
left_join (pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>%
filter (is.na(removed_version))Export Pelagophyceae
pr2_export <- pr2 %>%
filter(class == "Pelagophyceae") %>%
arrange(class, order, family, genus, species)
pr2_export(pr2_export, "../pr2/pr2_4_12_Pelagophyceae.fasta.gz", file_type = "fasta", file_format = "fasta_taxo_long")
pr2_export(pr2_export, "../pr2/pr2_4_12_Pelagophyceae.xlsx", file_type = "merged_excel")
pr2_export_taxonomy <- pr2_export %>%
count(class, order, family, genus, species)
knitr::kable(pr2_export_taxonomy)
openxlsx::write.xlsx(pr2_export_taxonomy, file="../pr2/pr2_Pelagoophyceae_taxonomy.xlsx", na="")RCC
Read RCC
db_rcc <- db_connect(db_info("rcc_local") )
cultures <- tbl(db_rcc, "cultures") %>%
collect()
taxonomy <- tbl(db_rcc, "taxonomy") %>%
collect()
cultures <- left_join(cultures, taxonomy)
# cultures <- cultures %>% filter(lost==0)
cultures <- cultures %>% mutate(year_entered = lubridate::year(date_entered_catalog),
year_sampled = lubridate::year(lubridate::parse_date_time(sampling_date, "ymd")),
year_lost = lubridate::year(lost_date),
latitude = mapply(lat_long_dec,sampling_lat_deg,sampling_lat_mn,sampling_lat_ns,"latitude"),
longitude = mapply(lat_long_dec,sampling_long_deg,sampling_long_mn,sampling_long_ew,"longitude"))
sequences <- tbl(db_rcc, "sequences") %>% collect()
db_disconnect(db_rcc)
# Build some tables -------------------------------------------------------
sequences_18S <- sequences %>%
filter(str_detect(gene_name, "18S")) %>%
select(rcc_id, genbank_accession, sequence) %>%
rename(accession_18S = genbank_accession, sequence_18S = sequence) %>%
mutate(sequence_18S_length = str_length(sequence_18S))
sequences_18S_longer <- sequences_18S %>%
arrange(rcc_id, desc(sequence_18S_length)) %>%
distinct(rcc_id, .keep_all = TRUE)
sequences_ITS <- sequences %>%
filter(str_detect(gene_name, "ITS")) %>%
select(rcc_id, genbank_accession, sequence) %>%
rename(accession_ITS = genbank_accession, sequence_ITS = sequence)Construct and export tables
cultures_Pelago <- cultures %>%
# filter(lost==0) %>%
filter(class == "Pelagophyceae") %>%
arrange(order, family, genus, species, rcc_id) %>%
select(order, family, genus, species, rcc_id, lost_date,
strain_name, strain_name_synonyms,
sampling_ocean, sampling_regional_sea , sampling_depth,
sampling_date, latitude, longitude) %>%
left_join(sequences_18S_longer) %>%
left_join(sequences_ITS)
openxlsx::write.xlsx(cultures_Pelago, "../rcc/RCC_Pelago.xlsx", na="")Export fasta
cultures_Pelago_18S <- cultures_Pelago %>%
filter(!is.na(accession_18S)) %>%
mutate(sequence_name = str_c(accession_18S, str_c("RCC", rcc_id), strain_name,species, sep="|" ))
seq_out <- DNAStringSet(cultures_Pelago_18S$sequence_18S) # Store the sequence in a DNAString
names(seq_out) <- cultures_Pelago_18S$sequence_name
writeXStringSet(seq_out, "C:/Users/vaulot/Desktop/RCC_Pelago_18S.fasta", compress=FALSE) Metabarcodes
Datasets included
All public data sets
datasets <- metapr2_export_datasets() %>%
filter(public == 1) %>%
filter(processing_pipeline_metapr2 %in% c("dada2", "swarm"))
datasets %>%
select(dataset_id, dataset_code, dataset_name, gene_region) %>%
arrange(gene_region) %>%
knitr::kable() | dataset_id | dataset_code | dataset_name | gene_region |
|---|---|---|---|
| 1 | OSD_2014_V4_LGC | Ocean Sampling Day - 2014 - V4 LGC | V4 |
| 2 | OSD_2015_V4 | Ocean Sampling Day - 2015 - V4 | V4 |
| 3 | OSD_2014_V4_LW | Ocean Sampling Day - 2014 - V4 LW | V4 |
| 5 | MALINA_Monier_2014 | Arctic Ocean, Beaufort Sea, MALINA cruise - 2009 | V4 |
| 6 | Arctic_Stecher_2016 | Central Arctic Ocean - 2012 | V4 |
| 9 | Nansen_Metfies_2016 | Nansen Basin - 2012 | V4 |
| 11 | Fieldes_Bay_Luo_2016 | Fieldes Bay, Antarctic - 2013 | V4 |
| 19 | Baltic_Sea_2012_2013 | Gulf of Finland - 2012-2013 | V4 |
| 20 | Oslo_fjord_2009_2011 | Oslo fjord - 2009-2011 | V4 |
| 33 | Tsunami_18S | Tsunami deposits 18S R1 | V4 |
| 34 | Malaspina_vertical_V4 | Malaspina expedition - vertical profiles - 2010-2011 | V4 |
| 35 | Malaspina_surface_V4 | Malaspina expedition - surface - 2010-2011 | V4 |
| 36 | Blanes_2004_2013 | Blanes Time Series - 2004-2013 | V4 |
| 37 | Baffin_Bay_Joli_2013 | Baffin Bay - 2013 | V4 |
| 38 | White_Sea_2013_2015 | White Sea - 2013-2015 | V4 |
| 39 | Arctic_Ocean_PS80_2012 | Arctic Ocean - Polarstern expedition ARK-XXVII/3 - 2012 | V4 |
| 40 | Arctic_Ocean_Survey | Arctic Ocean Survey - 2005-2011 | V4 |
| 41 | Chukchi_Sea_ICESCAPE | Chukchi Sea - ICESCAPE - 2010 | V4 |
| 42 | Arctic_Nares_Strait | Nares Strait - 2014 | V4 |
| 43 | Baltic_Gdansk_2012 | Gdansk Gulf - 2012 | V4 |
| 44 | Baltic_Gdansk_2012_Hapto | Gdansk Gulf - 2012 Hapto | V4 |
| 49 | Naples_2011 | Bay of Naples - 2011 | V4 |
| 15 | Tara_Oceans | Tara Oceans - 2009-2012 | V9 |
datasets_selected <- list()
datasets_selected[["all"]] <- datasets %>%
filter(dataset_id %in% c(1,2,15, 34, 35, 36))
cat("Data sets selected")Data sets selected
datasets_selected[["all"]] %>%
select(dataset_id, dataset_code, dataset_name, gene_region) %>%
arrange(gene_region) %>%
knitr::kable()| dataset_id | dataset_code | dataset_name | gene_region |
|---|---|---|---|
| 1 | OSD_2014_V4_LGC | Ocean Sampling Day - 2014 - V4 LGC | V4 |
| 2 | OSD_2015_V4 | Ocean Sampling Day - 2015 - V4 | V4 |
| 34 | Malaspina_vertical_V4 | Malaspina expedition - vertical profiles - 2010-2011 | V4 |
| 35 | Malaspina_surface_V4 | Malaspina expedition - surface - 2010-2011 | V4 |
| 36 | Blanes_2004_2013 | Blanes Time Series - 2004-2013 | V4 |
| 15 | Tara_Oceans | Tara Oceans - 2009-2012 | V9 |
Summarize function
- At species level
summarise_asv_set <- function(one_asv_set, directory) {
# one_asv_set <- filter(asv_set[["V4"]]$df, dataset_id != 36)
# directory = "../metabarcodes/V4/"
summary_species <- phyloseq_long_treemap(one_asv_set, genus, species, "Species" )
# Normalize by total number of Pelagophyceae reads
asv_class <- one_asv_set %>%
group_by(file_code) %>%
summarise(reads_class = sum(n_reads))
one_asv_set <- left_join(one_asv_set, asv_class) %>%
mutate (pct_class = n_reads/reads_class*100,
pct_total = n_reads/reads_total*100)
summary_pct <- one_asv_set %>%
group_by(species) %>%
summarise(pct_class = mean(pct_class, na.rm = TRUE),
pct_total = mean(pct_total, na.rm = TRUE))
summary_species <- left_join(summary_species$df, summary_pct)
print(knitr::kable(summary_species))
openxlsx::write.xlsx(summary_species, str_c(directory, "summary_species.xlsx"))
#
# # Map of number of samples where genus found
# asv_summary <- one_asv_set %>%
# distinct(file_code, genus, longitude, latitude, .keep_all = TRUE) %>%
# count(genus, species, longitude, latitude, name="n_samples")
#
# # print(knitr::kable(asv_summary))
world <- map_data("world")
g <- ggplot() +
geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="grey") +
coord_fixed(1.3) +
geom_point(data=one_asv_set, aes(x=longitude, y=latitude, size=pct_class), fill="blue", shape=21) +
# scale_size_area(trans="log10") +
ggtitle("Percent of Pelagophyceae") +
facet_wrap(vars(species))
print(g)
for (one_species in unique(one_asv_set$species)) {
df <- one_asv_set %>%
filter(species == one_species) %>%
select(longitude, latitude, pct_class) %>%
rename(z=pct_class)
gg <- map_distribution(df, map_title = str_c(one_species," - % of Pelagophyceae"))
print(gg)
ggsave(plot= gg , filename=str_c(directory,"maps_pct_pelago/map_",one_species,"_pct_pelago.pdf"),
width = 15 , height = 12, scale=1.80,
units="cm", useDingbats=FALSE)
}
for (one_species in unique(one_asv_set$species)) {
df <- one_asv_set %>%
filter(species == one_species) %>%
select(longitude, latitude, pct_total) %>%
rename(z=pct_total)
gg <- map_distribution(df, map_title = str_c(one_species," - % of Total reads"),
z_limits = c(0, 50),
z_breaks = c(5, 10, 20, 40, 50))
print(gg)
ggsave(plot= gg , filename=str_c(directory,"maps_pct_total/map_",one_species,"_pct_total.pdf"),
width = 15 , height = 12, scale=1.80,
units="cm", useDingbats=FALSE)
}
return(TRUE)
}Get the data
# Export data
asv_set <- list()
for (one_region in regions) {
asv_set[[one_region]] <- metapr2_export_asv(taxo_level = class,
taxo_name="Pelagophyceae",
dataset_id_selected = datasets_selected[[one_region]]$dataset_id,
export_fasta = TRUE,
export_wide_xls = TRUE,
export_sample_xls = TRUE,
export_phyloseq = FALSE,
directory = str_c("../metabarcodes/",one_region, "/"),
taxonomy_full= TRUE,
boot_min = 100,
boot_level = class_boot,
use_hash = TRUE)
saveRDS(asv_set[[one_region]], str_c("../metabarcodes/", one_region, "/asv_set.rda"))
} asv_set <- list()
for (one_region in regions) {
asv_set[[one_region]] <- readRDS( str_c("../metabarcodes/", one_region, "/asv_set.rda"))
summarise_asv_set(filter(data.frame(asv_set[[one_region]]$df), dataset_id != 36), directory = str_c("../metabarcodes/", one_region, "/"))
}Joining, by = "file_code"
Joining, by = "species"
genus species n_reads pct_class pct_total
------------------------- ----------------------------- -------- ----------- ----------
Ankylochrysis Ankylochrysis_lutea 833 67.5159512 0.4905626
Ankylochrysis Ankylochrysis_sp. 104 55.0653595 0.0404306
Aureococcus Aureococcus_anophagefferens 25736 27.6070076 0.2275524
Aureoumbra Aureoumbra_lagunensis 50 36.1818525 0.0583039
Chrysocystis Chrysocystis_fragilis 1172 26.5399034 0.1376429
Chrysoreinhardia Chrysoreinhardia_sp. 2 0.0042324 0.0020712
Pelagococcus Pelagococcus_sp. 5418 23.0231659 0.0699292
Pelagococcus Pelagococcus_subviridis 1 0.1996008 0.0010934
Pelagomonadaceae_clade_A Pelagomonadaceae_clade_A_sp. 17688 26.6221946 0.1094061
Pelagomonadaceae_X Pelagomonadaceae_X_sp. 286 1.6478726 0.0124078
Pelagomonas Pelagomonas_calceolata 614671 53.3934368 2.1871642
Pelagophyceae_Svalbard Pelagophyceae_Svalbard_sp. 498 59.8075003 0.3833255
Pelagophyceae_XXX Pelagophyceae_XXX_sp. 21150 20.6051618 0.1155214
Sarcinochrysidaceae_X Sarcinochrysidaceae_X_sp. 566 46.5579201 0.0752083
Sarcinochrysis Sarcinochrysis_sp. 3056 45.9810724 0.1722144
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Warning: Removed 3 rows containing missing values (geom_point).
Warning: Removed 3 rows containing missing values (geom_point).
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Regions defined for each Polygons
Joining, by = "file_code"
Joining, by = "species"
genus species n_reads pct_class pct_total
------------------------- ----------------------------- -------- ---------- ----------
Aureococcus Aureococcus_anophagefferens 887561 22.788344 0.1125895
Pelagococcus Pelagococcus_sp. 25784 6.688237 0.0075565
Pelagomonadaceae_clade_A Pelagomonadaceae_clade_A_sp. 16250 7.147905 0.0029203
Pelagomonadaceae_X Pelagomonadaceae_X_sp. 18632 4.343436 0.0028249
Pelagomonas Pelagomonas_calceolata 3210461 72.359834 0.3401518
Sarcinochrysidaceae_X Sarcinochrysidaceae_X_sp. 473 4.335542 0.0011510
Warning: Removed 107 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 19 rows containing missing values (geom_point).
Warning: Removed 19 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 7 rows containing missing values (geom_point).
Warning: Removed 14 rows containing missing values (geom_point).
Warning: Removed 7 rows containing missing values (geom_point).
Warning: Removed 14 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 9 rows containing missing values (geom_point).
Warning: Removed 9 rows containing missing values (geom_point).
Warning: Removed 9 rows containing missing values (geom_point).
Warning: Removed 9 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 9 rows containing missing values (geom_point).
Warning: Removed 6 rows containing missing values (geom_point).
Warning: Removed 9 rows containing missing values (geom_point).
Warning: Removed 6 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 13 rows containing missing values (geom_point).
Warning: Removed 13 rows containing missing values (geom_point).
Warning: Removed 13 rows containing missing values (geom_point).
Warning: Removed 13 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 5 rows containing missing values (geom_point).
Warning: Removed 3 rows containing missing values (geom_point).
Warning: Removed 5 rows containing missing values (geom_point).
Warning: Removed 3 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 11 rows containing missing values (geom_point).
Warning: Removed 3 rows containing missing values (geom_point).
Warning: Removed 11 rows containing missing values (geom_point).
Warning: Removed 3 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 15 rows containing missing values (geom_point).
Warning: Removed 15 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 14 rows containing missing values (geom_point).
Warning: Removed 14 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 12 rows containing missing values (geom_point).
Warning: Removed 12 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 23 rows containing missing values (geom_point).
Warning: Removed 23 rows containing missing values (geom_point).
Regions defined for each Polygons
Warning: Removed 7 rows containing missing values (geom_point).
Warning: Removed 7 rows containing missing values (geom_point).
Preliminary analyses - DO NOT RUN
Check that all sequence with same hash have same taxonomy
This check was done before the hash analysis was done
asv_sequence_V4_hash_repeated <- asv_set_V4$fasta %>%
group_by(sequence_hash, species) %>%
summarise(n = n()) %>%
filter(n>1) %>%
arrange(sequence_hash)
asv_sequence_V4_all <- asv_set_V4$fasta %>%
arrange(sequence_hash)
asv_sequence_V4_dereplicated <- asv_set_V4$fasta %>%
group_by(sequence_hash) %>%
slice(1) %>%
ungroup()